## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.3     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose

#Body condition factor is BCF = ((body weight(g) - gut content(g)) / length(cm)^3) *100.

## Rows: 21 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): Site
## dbl (2): Site_Location, Body_Condition_Factor
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
ggplot(Site_BCF,aes(Body_Condition_Factor,Site)) +
  geom_point() +
  geom_smooth(method="glm", method.args=list(family="binomial"(link="logit")))+
  geom_hline(yintercept = mean(Site_BCF$Site), linetype = "dashed") +
  labs(title="GLM, binomial (DF/WB)")
## Warning in mean.default(Site_BCF$Site): argument is not numeric or logical:
## returning NA
## `geom_smooth()` using formula 'y ~ x'
## Warning: Computation failed in `stat_smooth()`:
## y values must be 0 <= y <= 1
## Warning: Removed 1 rows containing missing values (geom_hline).

#Binomial showing Body Condition Factor values of individuals #across two study sites: Wall Branch (WB) creek system and Dry Fork (DF), Whites creek system.

## Rows: 21 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): Site_Location, Body_Condition_Factor
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

Binary

#Here, the two sites are assigned as 1= Wall Branch and 0= Dry Fork #Scatter plot with geom_smooth applied

ggplot(Site_BodyConditionFactor,aes(Body_Condition_Factor,Site_Location)) +
  geom_point() +
  geom_smooth() +
  xlab ("Body Condition Factor") +
  ylab ("Site") +
  labs(title="Raw Fit: 1=Wall Branch, 0=Dry Fork")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Body Condition Factor values range from 0.64 to 0.93, values divided 100.

Site_BodyConditionFactor$BCF100 <- Site_BodyConditionFactor$Body_Condition_Factor/100
fit.1 <- glm(Site_Location~BCF100, data=Site_BodyConditionFactor, binomial(link="logit"))
anova(fit.1)
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Site_Location
## 
## Terms added sequentially (first to last)
## 
## 
##        Df Deviance Resid. Df Resid. Dev
## NULL                      20     29.064
## BCF100  1   0.2005        19     28.864
summary(fit.1)
## 
## Call:
## glm(formula = Site_Location ~ BCF100, family = binomial(link = "logit"), 
##     data = Site_BodyConditionFactor)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.249  -1.159  -1.008   1.212   1.371  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)    1.967      4.650   0.423    0.672
## BCF100      -257.309    577.880  -0.445    0.656
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 29.065  on 20  degrees of freedom
## Residual deviance: 28.864  on 19  degrees of freedom
## AIC: 32.864
## 
## Number of Fisher Scoring iterations: 4

#For the summary of the binomial, we see that there is a negative effect of body condition factor.

autoplot(fit.1)

#This dataset is not good for use in binary test; we can see there is no type of logistic curve when looking at the Normal Q-Q in the autoplots. Data is nearly identical among the sites.

#Next to binnedplot

library(arm)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loading required package: lme4
## 
## arm (Version 1.11-2, built: 2020-7-27)
## Working directory is C:/Users/ridew/OneDrive/Documents/Advanced Analytics/GLM_Assignment/GLMAssignment
x <- predict(fit.1)
y <- resid(fit.1)
binnedplot(x, y)

#The results of the binnedplot, show that binned residuals of this data do not present good binary data. #The binned residual plot is used to view points that fall into +/- 2 standard errors; ~95% of the binned residuals.

coef(fit.1)
## (Intercept)      BCF100 
##    1.966753 -257.308577
confint(fit.1)
## Waiting for profiling to be done...
##                    2.5 %   97.5 %
## (Intercept)    -7.210625  11.7298
## BCF100      -1470.488665 880.3547
ggplot(Site_BodyConditionFactor, aes(BCF100,Site_Location)) +
  geom_point() +
  geom_smooth(method="glm", method.args=list(family="binomial"(link="logit"))) + 
  xlab ("Body Condition Factor") +
  ylab ("Site") +
  labs (title="Raw Fit: 1=Wall Branch, 0=Dry Fork")
## `geom_smooth()` using formula 'y ~ x'

#Logit is on the log scale; use invlogit.

invlogit <- function(x) {1 / ( 1+exp(-x) ) } 
invlogit(coef(fit.1)) 
##   (Intercept)        BCF100 
##  8.772620e-01 1.787741e-112

#The body condition values were nearly identical across both sites. Body Condition Factor is not a good predictor of what population (Dry Fork vs. Wall Branch) the individual belongs to.

#BCF100 ^-112 is very close to 0, further supporting this data is nearly identical in Body Condition Factor values at each site and not good data for use in GLM binary.

Figure 1. Length Measurements

Figure 2. Fathead Minnow Gonads (Testes)

#Dataset looking at Gonadosomatic Index (GSI) and Red Coloration Area

library(readr)
GSI_RedColorationArea <- read_csv("data/GSI_RedColorationArea.csv")
## Rows: 21 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): GSI_Value, Red_Coloration_Area
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

#First view on basic lm

ggplot(data = GSI_RedColorationArea, aes(x=GSI_Value, y=Red_Coloration_Area)) +
geom_point()+
  xlab("GSI") +
    ylab("Red Coloration %") +
theme_bw()+
geom_smooth(method=lm)
## `geom_smooth()` using formula 'y ~ x'

ggsave("outputs/ScatterGSI_Area.png")
## Saving 7 x 5 in image
## `geom_smooth()` using formula 'y ~ x'
FitGSI_Area <- lm(data= GSI_RedColorationArea, Red_Coloration_Area~GSI_Value)
autoplot(FitGSI_Area)

anova(FitGSI_Area)
## Analysis of Variance Table
## 
## Response: Red_Coloration_Area
##           Df  Sum Sq Mean Sq F value Pr(>F)
## GSI_Value  1   13.37  13.368  0.2345 0.6337
## Residuals 19 1083.04  57.002
summary(FitGSI_Area)
## 
## Call:
## lm(formula = Red_Coloration_Area ~ GSI_Value, data = GSI_RedColorationArea)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.129  -4.093   0.079   4.519  13.015 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  14.0913     2.5593   5.506  2.6e-05 ***
## GSI_Value    -0.1511     0.3120  -0.484    0.634    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.55 on 19 degrees of freedom
## Multiple R-squared:  0.01219,    Adjusted R-squared:  -0.0398 
## F-statistic: 0.2345 on 1 and 19 DF,  p-value: 0.6337
boxcox(FitGSI_Area)

# #see linear regression for FitGSI_Area

#These results show lambda value ~ 0.7.

#The R output for the boxcox() function plots the maximum likelihood surface (the curve) together with a maximum likelihood-based 95% CI (Hector, 2015).

#Helpful GLM component info from Hector, 2015 Ch8 #GLMs have three components: # 1.) a linear predictor- is what comes after the tilde (~) in our linear model formula # 2.) a variance function - models the variation in the data;make use of a much wider rangefamily of distributions including the poisson, the binomial, and the gamma. #3.) a link function- plays a role equivalent to the transformation in normal least squares models. However, rather than transforming the data we transform the predictions made by the linear predictor. Commonly used link functions include the log, square root, and logistic.

#Gaussian with default link

GLMGSI_Area <- glm(Red_Coloration_Area~GSI_Value, data= GSI_RedColorationArea, family= gaussian(link=)) 
ggplot(GSI_RedColorationArea,aes(GSI_Value,Red_Coloration_Area)) +
  geom_point() +
  geom_smooth(method="glm",colour="red",
                       method.args=list(family="gaussian"(link=)))+
  labs(title="GLM, Gaussian")
## `geom_smooth()` using formula 'y ~ x'

autoplot(GLMGSI_Area)

anova(GLMGSI_Area)
## Analysis of Deviance Table
## 
## Model: gaussian, link: identity
## 
## Response: Red_Coloration_Area
## 
## Terms added sequentially (first to last)
## 
## 
##           Df Deviance Resid. Df Resid. Dev
## NULL                         20     1096.4
## GSI_Value  1   13.368        19     1083.0
summary(GLMGSI_Area)
## 
## Call:
## glm(formula = Red_Coloration_Area ~ GSI_Value, family = gaussian(link = ), 
##     data = GSI_RedColorationArea)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -11.129   -4.093    0.079    4.519   13.015  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  14.0913     2.5593   5.506  2.6e-05 ***
## GSI_Value    -0.1511     0.3120  -0.484    0.634    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 57.00198)
## 
##     Null deviance: 1096.4  on 20  degrees of freedom
## Residual deviance: 1083.0  on 19  degrees of freedom
## AIC: 148.4
## 
## Number of Fisher Scoring iterations: 2

#Overdispersed: Residual deviance 1083 on 19 degrees of freedom

#Next use of Family: Gamma with link:log

gammalog <- glm(Red_Coloration_Area~GSI_Value, data= GSI_RedColorationArea, family=Gamma(link= "log"))
ggplot(GSI_RedColorationArea,aes(GSI_Value,Red_Coloration_Area)) +
  geom_point() +
  geom_smooth(method="glm",colour="red",
                       method.args=list(family="Gamma"(link="log")))+
  labs(title="GLM, log link, Gamma variance")
## `geom_smooth()` using formula 'y ~ x'

anova(gammalog)
## Analysis of Deviance Table
## 
## Model: Gamma, link: log
## 
## Response: Red_Coloration_Area
## 
## Terms added sequentially (first to last)
## 
## 
##           Df Deviance Resid. Df Resid. Dev
## NULL                         20     9.2373
## GSI_Value  1  0.08659        19     9.1508
summary(gammalog)
## 
## Call:
## glm(formula = Red_Coloration_Area ~ GSI_Value, family = Gamma(link = "log"), 
##     data = GSI_RedColorationArea)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.33051  -0.33911   0.00496   0.29534   0.75609  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.65341    0.19319  13.735 2.57e-11 ***
## GSI_Value   -0.01268    0.02355  -0.538    0.597    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.324797)
## 
##     Null deviance: 9.2373  on 20  degrees of freedom
## Residual deviance: 9.1508  on 19  degrees of freedom
## AIC: 148.62
## 
## Number of Fisher Scoring iterations: 5

#For summary of Gamma with link log function, we see a residual deviance of 9.15 on 19 degrees of freedom, data is underdispersed. P-value=0.597

coef(gammalog)
## (Intercept)   GSI_Value 
##  2.65341089 -0.01268145

For every one unit increase in GSI there is a 0.01268145 decrease in red coloration area.

confint(gammalog)
## Waiting for profiling to be done...
##                  2.5 %     97.5 %
## (Intercept)  2.2800283 3.05155112
## GSI_Value   -0.0578871 0.03754406
leaflet() %>%
  setView(-86.854396, 36.26361 , zoom = 16) %>% #lat-long of the place of interest
  addTiles() %>%
  addProviderTiles('Esri.WorldImagery') %>%
  addMarkers(-86.854396, 36.26361 , popup = "Dry Fork, Whites Creek System")
leaflet() %>%
  setView(-87.287965, 36.499277 , zoom = 16) %>% #lat-long of the place of interest
  addTiles() %>%
  addProviderTiles('Esri.WorldImagery') %>%
  addMarkers(-87.287965, 36.499277 , popup = "Rotary Park, Wall Branch Creek System")

Figure 3. Southern Redbelly Dace in Full Breeding Coloration (Chrosomus erythrogaster)